* Start from an empty session.
clear all
* Never pause output for a --more-- prompt while this do-file runs.
set more off
* Raise matsize ahead of the regressions below, which include thousands of dummy regressors.
set matsize 10000
* Close a log left open by an earlier run; capture ignores the error when none is open.
capture log close
* Work from the project root and (over)write this program's log.
cd "${master_dir}"
log using "${log_dir}/2-CreateAnalysisData.log", replace
***************************************************************************************************
* 
* Program: 2-CreateAnalysisDatasets.do
* Purpose: Create individual-level and county-level wait time datasets and CCES comparison dataset
* Sections:
*     1. Insheet and merge data and construct filters, wait times, and summary measures
*     2. Create county-level waittime dataset and construct and merge to other county-level data
*     3. Create CCES comparison dataset
*     4. Create datasets for maps produced in ArcGIS
* Files Used:
*     1. Pings_all_days.dta
*     2. likelyvoters.dta
*     3. IdentifiedRegularPingers.dta
*     4. block_group_data_2017.dta
*     5. countypres_2000-2016.csv
*     6. NCSL_2016laws.csv
*     7. online_table4-2.dta
*     8. 2013-17-ACS-county-data.csv
*     9. cb_2016_us_cd115_500k.shp
*     10. CCES16_Common_OUTPUT_Feb2018_VV.dta
*     11. PollingPlaces2016_w_TimeZones_and_Buildings.dta
* Files Created:
*     1. 2-CreateAnalysisData.log
*     2. voterwaittimes.dta
*     3. county_vote_shares.dta
*     4. ncsl_laws.dta
*     5. Chetty_County_Effects.dta
*     6. county_demo.dta
*     7. county_voterwaittimes_limited.dta
*     8. county_voterwaittimes.dta
*     9. cd_data.dta & cd_coords.dta
*     10. cces.dta
*     11. cces_comparison.dta
*     12. county_poll_stats.csv
*     13. polls_by_pings_identified.csv
*     14. wait_times_by_cd.csv
* NOTES: Download and install ebayes.ado from: 
*         http://sacarny.com/wp-content/uploads/2015/08/ebayes.ado
*        Also download these packages:
*         ssc install statastates, replace
*         ssc install geoinpoly, replace
*         ssc install shp2dta, replace
*
***************************************************************************************************

***************************************************************************************************
*  1. Insheet and merge data and construct filters, wait times, and summary measures
***************************************************************************************************

* Read only Election Day (8 Nov 2016) pings recorded within 60m of a polling place; the
* person-level filter variables are merged onto this ping-level file below.
use if day == td(08nov2016) & Dist_to_PollingPlace_M <= 60 ///
  using "${data_dir}/Pings_all_days.dta", clear

* "Likely Voter" person-level filters: keep every ping row, discard using-only rows, and
* code people who are absent from the likely-voter file as 0.
merge m:1 ID_11_16 using "$data_dir/likelyvoters.dta", keep(master match) nogenerate ///
  keepusing(likelyvoter_v1_d08 likelyvoter_v2_d08)
foreach version in v1 v2 {
    replace likelyvoter_`version'_d08 = 0 if likelyvoter_`version'_d08 == .
    rename likelyvoter_`version'_d08 likelyvoter_`version'
}

* "Regular Pinger" person info (# of unique hours pinged during Election Day)
merge m:1 ID_11_16 using "$data_dir/IdentifiedRegularPingers.dta", keep(master match) nogenerate

* "Consistent Pinger" filter (more than median number of hours in full data, i.e. >=12);
* people with a missing hour count are coded 0.
gen consistentpinger = (uniquepinghours != . & uniquepinghours >= 12)

* "Entered Polling Place" filter: 1 if any of the person's Election Day pings at this polling
* place fell inside the convex hull of the building.
bysort ID_11_16 PollingPlace_ID: egen enteredpoll = max(Ping_in_ConvexHull)

* First and last ping of each person at each polling place. The lower bound on the visit
* length is the span between them (timestamps are divided by 1000 to give seconds).
bysort ID_11 PollingPlace_ID: egen double earliestping = min(local_date_sec)
bysort ID_11 PollingPlace_ID: egen double latestping = max(local_date_sec)
gen double lowerbound = (latestping - earliestping)/1000

* Gap (seconds) between the first in-radius ping and the ping before it, and between the
* last in-radius ping and the ping after it; each is spread to every row of the group.
bysort ID_11 PollingPlace_ID: egen double sec_before_earliest_ping = ///
  max(cond(earliestping == local_date_sec, Sec_Since_Last, .))
bysort ID_11 PollingPlace_ID: egen double sec_after_latest_ping = ///
  max(cond(latestping == local_date_sec, Sec_Till_Next_Ping, .))

* Upper bound = lower bound plus both surrounding gaps; both bounds are stored in minutes.
gen double upperbound = (lowerbound + sec_before_earliest_ping + sec_after_latest_ping) / 60
replace lowerbound = lowerbound / 60

* Primary Wait Time Variable: midpoint of the two bounds
gen waittime = lowerbound + (upperbound - lowerbound)/2

* Wait Over 30 Minutes (Binary Outcome); left missing when the wait time is missing
gen wait_over_30min = (waittime >= 30) if waittime != .

* "Reasonable Values" filter (upper bound more than 1 minute & less than 2 hours);
* a missing upper bound fails the second comparison and is therefore coded 0.
gen reasonablevalues = (upperbound > 1) & (upperbound < (60*2))

* Hour of arrival on Election Day (hour of earliest ping in the 60m radius)
gen hour_of_arrival = hh(earliestping)

* Attach Census demographics for the polling place's block group; block groups that appear
* only in the Census file are discarded.
sort gisjoin
merge m:1 gisjoin using "$raw_dir/block_group_data_2017.dta", keep(master match) nogenerate

* Express the population measures in thousands
foreach popvar of varlist pop pop_sqmi {
    replace `popvar' = `popvar' / 1000
}

* Additional race groupings built from the Census components
gen race_othernonwhite = race_native + race_other + race_multi
gen race_asianpi = race_asian + race_pac_islander
gen race_allnonwhite = race_black + race_asianpi + race_hispanic + race_othernonwhite

* State-Level Opening & Closing Times (hour of day)
* Hand-transcribed from:
* https://ballotpedia.org/State_Poll_Opening_and_Closing_Times_(2016)#table
*** North Dakota open/close times by polling place available in /2016 Election/North Dakota/North Dakota 2016.xlsx
*** Vermont open/close times by polling place available in /2016 Election/Vermont/Vermont 2016.xlsx
* States with clearly flexible rules (e.g. Idaho between 7 to 8am at discretion of county
* clerk, or Georgia closed at diff times depending on city size) are set to missing.

* Opening hour: 7 unless the state is listed below.
gen poll_open = 7
replace poll_open = 6 if ///
  inlist(state_name,"Arizona","Connecticut","Illinois","Indiana","Kentucky") | ///
  inlist(state_name,"Louisiana","Missouri","New York","Virginia","New Jersey")
replace poll_open = 6.5 if inlist(state_name,"North Carolina","Ohio","West Virginia")
replace poll_open = 7.5 if state_name == "Arkansas"
replace poll_open = 8 if state_name == "Nebraska" & HourOffset == 6
replace poll_open = . if inlist(state_name,"Idaho","Kansas","Massachusetts","Minnesota", ///
  "New Hampshire","North Dakota","Tennessee","Vermont","Maine")

* Closing hour: 19 unless the state is listed below.
gen poll_close = 19
replace poll_close = 18 if inlist(state_name,"Kentucky","Indiana","Hawaii")
replace poll_close = 19.5 if inlist(state_name,"North Carolina","Ohio","West Virginia")
replace poll_close = 20 if ///
  inlist(state_name,"Alaska","California","Connecticut","Delaware","Idaho", ///
  "Louisiana","Maine","Maryland","Massachusetts") | ///
  inlist(state_name,"Michigan","Minnesota","Montana","Pennsylvania", ///
  "Rhode Island","Utah","District of Columbia","Wisconsin") | ///
  (state_name == "Nebraska" & HourOffset == 6) | ///
  (state_name == "Tennessee" & HourOffset == 5)
replace poll_close = 21 if inlist(state_name,"Iowa","New York")
replace poll_close = . if inlist(state_name,"Georgia","Kansas","New Hampshire","North Dakota")

* "Early" vs. "Late" open and close indicators: early = opens at 6 / closes at 19,
* late = opens at 7 / closes at 20, missing for any other hour.
gen poll_open_early = cond(poll_open == 6, 1, cond(poll_open == 7, 0, .))
gen poll_close_early = cond(poll_close == 19, 1, cond(poll_close == 20, 0, .))

* Create ID tag
* Sort rows by descending upper bound, then flag a single row for each distinct value of ID.
* NOTE(review): presumably the sort is meant to make the flagged row the one with the largest
* upper bound, but egen's tag() sorts the data itself and does not promise which row of a
* group it flags -- confirm whether a deterministic choice is needed here.
gsort -upperbound
egen tag_ID = tag(ID)
* Rename the platform indicator (presumably 0 = Apple, 1 = Google, going by the original name).
rename Apple_0_Google_1 android

* Keep one observation per voter
keep if tag_ID == 1

* Variable labels: polling-place (block group) characteristics
label variable race_white "Fraction White"
label variable race_black "Fraction Black"
label variable race_othernonwhite "Fraction Other Non-White"
label variable race_allnonwhite "Fraction Non-White"
label variable race_hispanic "Fraction Hispanic"
label variable race_asian "Fraction Asian (Not Including Pacific Islander)"
label variable race_asianpi "Fraction Asian"
label variable pov_under_poverty_line "Fraction Below Poverty Line"
label variable pop "Population (1000s)"
label variable pop_sqmi "Population Per Sq Mile (1000s)"
label variable poll_open "Hour of Open of Polling Place"
label variable poll_close "Hour of Close of Polling Place"

* Variable labels: wait time measures
label variable waittime "Wait Time (Minutes)"
label variable wait_over_30min "Wait Time Is Over 30min"
label variable upperbound "Upper Bound of Wait Time Measure (Lower Bound plus time before/until surrounding pings)"
label variable lowerbound "Lower Bound of Wait Time Measure (Difference between first and last ping in radius)"

* Variable labels: sample filters
* NOTE(review): the two likely-voter labels carry the same description -- confirm whether
* the second definition differs and should be labelled accordingly.
label variable likelyvoter_v1 "Likely Voter 1: Only show up in Election Day radius on Election Day"
label variable likelyvoter_v2 "Likely Voter 2: Only show up in Election Day radius on Election Day"
label variable consistentpinger "Consistent Pinger: Records pings on 12 or more unique hours of Election Day"
label variable enteredpoll "Entered Poll: Ping recorded in convex hull of Building at some point on Election Day"
label variable reasonablevalues "Reasonable Values: Upper bound on wait time is between 1 minute & 2 hours"

* Variable labels: arrival timing
label variable earliestping "First ping on Election Day in 60m radius (Time of Arrival)"
label variable latestping "Last ping on Election Day in 60m radius"
label variable hour_of_arrival "Hour of first ping on Election Day in 60m radius (Hour of Arrival)"

* Write the individual-level analysis file
save "${data_dir}/voterwaittimes.dta", replace

***************************************************************************************************
*  2. Create county-level waittime dataset and construct & merge to other county-level data
***************************************************************************************************

**********
* County- & State-Level Vote Shares: 
**********
*   https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/VOQCHQ/FQ9NBF&version=5.0

cd "${master_dir}"
import delimited "${raw_dir}/countypres_2000-2016.csv", clear

* Keep the Republican rows for 2012 and 2016, dropping records without a county FIPS code.
keep if fips != "NA" & inlist(year, 2012, 2016) & party == "republican"

* "Republican Vote Share" = votes for the Republican candidate divided by total votes,
* computed for each county and, from the summed counts, for each state-year.
destring candidatevotes, replace force
bysort state year: egen state_candidatevotes = total(candidatevotes)
bysort state year: egen state_totalvotes = total(totalvotes)
gen county_republican_share = candidatevotes/totalvotes
gen state_republican_share = state_candidatevotes/state_totalvotes
keep fips year county_republican_share state_republican_share totalvotes ///
  state_totalvotes

* Restore the leading zero on 4-character FIPS codes; fips_length records the original length.
rename fips statecountyfips
gen fips_length = length(statecountyfips)
replace statecountyfips = "0" + statecountyfips if fips_length == 4

* One row per county, with a separate column for each year's measures
reshape wide state_republican_share county_republican_share totalvotes state_totalvotes, ///
  i(statecountyfips) j(year)
sort statecountyfips
save "${raw_dir}/county_vote_shares.dta", replace

**********
* State Law Variables: Early Voting and Strict ID
**********
* Early / Absentee Voting Laws & Strict ID Laws from the National Conference of State Legislatures 
*  Will follow Cantoni & Pons (2019) in calling "early voting" the intersection of same-day & early
*  voting rules. ("share of elections in which same-day voter registration, automatic voter
*  registration, no-excuse absentee voting, and early voting were available to voters in each state 
*  come from the National Conference of State Legislatures). 4 sets of tables:
* Internet Archive Snapshot from November 8, 2016 (last updated on 10/25/2016) -- Early Voting
* https://web.archive.org/web/20161108225513/http://www.ncsl.org/research/elections-and-campaigns/absentee-and-early-voting.aspx
* Internet Archive Snapshot from November 13, 2016 (last updated on 9/26/2016) -- Strict ID
* https://web.archive.org/web/20161113113845/http://www.ncsl.org/research/elections-and-campaigns/voter-id.aspx
* Internet Archive Snapshot from December 18, 2016 (last updated 11/28/2016)-- Same-Day Registration
* https://web.archive.org/web/20161218120734/http://www.ncsl.org/research/elections-and-campaigns/same-day-registration.aspx
* Internet Archive Snapshot from November 18, 2016 (last updated 11/28/2016)-- Automatic Registration
* https://web.archive.org/web/20161112064023/http://www.ncsl.org/research/elections-and-campaigns/automatic-voter-registration.aspx
* Filled these forms in by hand. 

import delimited "${raw_dir}/NCSL_2016laws.csv", clear

* "Early voting" law indicator: 1 if the state has any of the listed provisions.
gen earlyvotinglaw = 0
foreach provision of varlist earlyvoting noexcuseabsentee allmailvoting samedayreg autoreg {
    replace earlyvotinglaw = 1 if `provision' == 1
}

* Strict voter ID law indicator
gen strictidlaw = (strictid == 1)

keep state earlyvotinglaw strictidlaw
sort state
save "${raw_dir}/ncsl_laws.dta", replace

**********
* Chetty-Hendren County-Level Measures
**********
* From "The Effects of Neighborhoods on Intergenerational Mobility II: County Level Estimates"
*      Quarterly Journal of Economics, 133(3): 1163-1228, 2018.
* NOTE: These use 2000 census FIPS geography
* https://opportunityinsights.org/wp-content/uploads/2018/04/online_table4-2.dta
*   causal_p25_cty_kr26: Chetty Causal Effect for P25
*   causal_p25_cty_kr26_se: SE on Chetty Causal Effect for P25
*   causal_p75_cty_kr26: Chetty Causal Effect for P75
*   causal_p75_cty_kr26_se: SE on Chetty Causal Effect for P75
*   cs_race_theil_2000: Racial Segregation
*   cs00_seg_inc : Income Segregation
*   hhinc00: Household Income per Capita
*   gini: Gini Coefficient
*   inc_share_1perc: Top 1% Income Share
*   taxrate: Local Tax Rate
*   scap_ski90pcm: Social Capital Index
*   cty_pop2000: County Population in 2000 Census

use "${raw_dir}/online_table4-2.dta", clear

* Convert the county code to a string key, restoring the leading zero on 4-character codes.
tostring cty2000, replace
rename cty2000 statecountyfips
replace statecountyfips = "0" + statecountyfips if length(statecountyfips) == 4

* Keep the merge key plus the causal-effect estimates, their standard errors, and the
* county covariates used later.
keep statecountyfips causal_p25_cty_kr26 causal_p25_cty_kr26_se causal_p75_cty_kr26 ///
  causal_p75_cty_kr26_se cs_race_theil_2000_st cs00_seg_inc_st hhinc00_st gini_st ///
  inc_share_1perc_st taxrate_st scap_ski90pcm_st cty_pop2000
sort statecountyfips
save "${raw_dir}/Chetty_County_Effects.dta", replace

**********
* County-Level Demographics (also taken from 2013-2017 ACS, same as block group variables)
**********
import delimited "${raw_dir}/2013-17-ACS-county-data", clear
keep race_white race_black race_hispanic race_asian race_pac_islander race_native race_other ///
  race_multi pop pop_sqmi pov_under_poverty_line county_fips

* Express the population measures in thousands
foreach popvar of varlist pop pop_sqmi {
    replace `popvar' = `popvar' / 1000
}

* Same additional race groupings as are built for the block-group data
gen race_othernonwhite = race_native + race_other + race_multi
gen race_asianpi = race_asian + race_pac_islander
gen race_allnonwhite = race_black + race_asianpi + race_hispanic + race_othernonwhite

* Prefix every variable with county_, then turn the FIPS code into the string merge key
* (leading zero restored on 4-character codes; fips_length records the original length).
rename * county_*
rename county_county_fips statecountyfips
tostring statecountyfips, replace
gen fips_length = length(statecountyfips)
replace statecountyfips = "0" + statecountyfips if fips_length == 4
sort statecountyfips
save "${raw_dir}/county_demo.dta", replace

******
* Create Primary County-Level Dataset
******

cd "${master_dir}"
use "${data_dir}/voterwaittimes.dta", clear

* Analysis sample: observations that pass all four filters
keep if likelyvoter_v1 == 1 & enteredpoll == 1 & consistentpinger == 1 & reasonablevalues == 1

* County-level mean, standard deviation and count of the wait time and of the
* over-30-minutes indicator
local src_waittime   waittime
local src_waitover30 wait_over_30min
foreach measure in waittime waitover30 {
    bysort statecountyfips: egen county_`measure'_avg = mean(`src_`measure'')
    bysort statecountyfips: egen county_`measure'_sd = sd(`src_`measure'')
    bysort statecountyfips: egen county_`measure'_N = count(`src_`measure'')
}

* State-level mean, standard deviation and count of the wait time
bysort statefips: egen state_waittime_avg = mean(waittime)
bysort statefips: egen state_waittime_sd = sd(waittime)
bysort statefips: egen state_waittime_N = count(waittime)

* Construct county & state interaction terms (coefficient & standard error).
* For each geography, regress wait time on a complete set of geography dummies plus
* geography-by-race_black interactions (no constant, SEs clustered by polling place), then
* copy each unit's interaction coefficient and standard error onto that unit's rows.
* The number of dummies is read from the encoded variable instead of being hard-coded, so
* the regressor ranges and loops always match the number of states/counties in the sample.

* State level
encode statefips, gen(state_num)
summarize state_num, meanonly
local n_states = r(max)
xi i.state_num*race_black, noomit
reg waittime _Istate_num_1-_Istate_num_`n_states' _IstaXrace_1-_IstaXrace_`n_states', ///
  noconstant cl(PollingPlace_ID)
gen state_coef = .
gen state_se = .
forval i = 1/`n_states' {
    replace state_coef = _b[_IstaXrace_`i'] if state_num == `i'
    replace state_se = _se[_IstaXrace_`i'] if state_num == `i'
}

* County level
encode statecountyfips, gen(county_num)
summarize county_num, meanonly
local n_counties = r(max)
xi i.county_num*race_black, noomit
reg waittime _Icounty_nu_1-_Icounty_nu_`n_counties' _IcouXrac_1-_IcouXrac_`n_counties', ///
  noconstant cl(PollingPlace_ID)
gen county_coef = .
gen county_se = .
forval i = 1/`n_counties' {
    replace county_coef = _b[_IcouXrac_`i'] if county_num == `i'
    replace county_se = _se[_IcouXrac_`i'] if county_num == `i'
}

* Set county & state coefficient to missing if the standard error is zero (i.e. no variation)
foreach geo in state county {
    replace `geo'_coef = . if `geo'_se == 0
}

* Reduce to one row per county, and flag one row per state for the state-level steps
egen tag_county = tag(statecountyfips)
keep if tag_county == 1
egen tag_state = tag(statefips)

* Standard error of each mean wait time
foreach geo in state county {
    gen `geo'_waittime_se = (`geo'_waittime_sd / sqrt(`geo'_waittime_N))
}

* Empirical Bayes adjusted state means & disparities, estimated on one row per state ...
ebayes state_waittime_avg state_waittime_se if tag_state == 1, gen(state_waittime_avg_ebayes)
ebayes state_coef state_se if tag_state == 1, gen(state_coef_ebayes)

* ... then copied to every county row of the state (the mode of the state's values)
foreach adj in state_waittime_avg_ebayes state_coef_ebayes {
    rename `adj' `adj'1
}
foreach adj in state_waittime_avg_ebayes state_coef_ebayes {
    bysort statefips: egen `adj' = mode(`adj'1)
}
drop state_waittime_avg_ebayes1 state_coef_ebayes1

* Empirical Bayes adjusted county means & disparities,
*  limited to counties that have at least 30 observations
ebayes county_waittime_avg county_waittime_se if county_waittime_N >= 30, gen(county_waittime_avg_ebayes)
ebayes county_coef county_se if county_waittime_N >= 30, gen(county_coef_ebayes)

* Keep the county/state measures and identifiers, then save
keep county_* state_* statecountyfips statefips
sort statecountyfips
save "${data_dir}/county_voterwaittimes_limited.dta", replace

**********
* Merge each prior component (Vote Shares, State Laws, Chetty-Hendren, Demos) to this dataset
**********
use "${data_dir}/county_voterwaittimes_limited.dta", clear

* Each merge below tabulates the match result and then discards rows that exist only in
* the file being merged in, so the county list stays that of the wait-time dataset.

* Vote Shares (by county)
sort statecountyfips
merge 1:1 statecountyfips using "${raw_dir}/county_vote_shares.dta"
tabulate _merge
keep if _merge != 2
drop _merge

* State Laws (by state; the key is temporarily renamed to match the laws file)
rename state_name state
sort state
merge m:1 state using "${raw_dir}/ncsl_laws.dta"
tabulate _merge
keep if _merge != 2
drop _merge
rename state state_name

* Chetty-Hendren Measures (by county)
sort statecountyfips
merge 1:1 statecountyfips using "${raw_dir}/Chetty_County_Effects.dta"
tabulate _merge
keep if _merge != 2
drop _merge

* County Demographics (by county)
sort statecountyfips
merge 1:1 statecountyfips using "${raw_dir}/county_demo.dta"
tabulate _merge
keep if _merge != 2
drop _merge

**********
* Partisan Identity of Secretary of State
**********
* https://web.archive.org/web/20161023090417/http://en.wikipedia.org/wiki/Secretary_of_state_(U.S._state_government)

* sec_state_Rep is 0 for the states listed here (those coded as having a Democratic
* Secretary of State) and 1 for every other state.
gen sec_state_Rep = !( ///
  inlist(state_name,"Alaska","California","Connecticut","Delaware", ///
  "District of Columbia","Illinois","Kentucky","Maine","Massachusetts") | ///
  inlist(state_name,"Minnesota","Missouri","Montana","New Hampshire", ///
  "New York","North Carolina","Pennsylvania","Rhode Island","Vermont") | ///
  inlist(state_name,"Wisconsin","Virginia","West Virginia"))

save "${data_dir}/county_voterwaittimes.dta", replace

***************************************************************************************************
*  3. Create Congressional District Level CCES comparison dataset
***************************************************************************************************

**********
* CCES 2016 Survey Data: Construct Wait Time measures (and disparities) and prepare for merge
**********
*  NOTE: For "More than 1 hour" responses, we use 90min

use "${raw_dir}/CCES16_Common_OUTPUT_Feb2018_VV.dta", clear

* State name as a string, and a congressional district code padded to two characters
decode inputstate, gen(state)
gen cd_length = length(cdid115)
gen cd115 = cdid115
replace cd115 = "0" + cd115 if cd_length == 1

* Map the categorical wait-time answer (CC16_404) to minutes; any other answer stays missing
gen cces_waittime = cond(CC16_404 == 1, 0, ///
                    cond(CC16_404 == 2, 5, ///
                    cond(CC16_404 == 3, 20, ///
                    cond(CC16_404 == 4, 45, ///
                    cond(CC16_404 == 5, 90, .)))))

* Keep just people who voted in person on election day (after tabulating the voting mode
* overall and separately for race == 2 and race != 2)
tabulate CC16_403
tabulate CC16_403 if race == 2
tabulate CC16_403 if race != 2
keep if CC16_403 == 1

* Race indicators used for the disparity measures
gen black = (race == 2)
gen nonblack = !black

* Congressional district grouping (state x district)
egen cd = group(state cd115)

* State-level CCES wait time: mean, standard deviation and count over all respondents
bysort state: egen state_cces_waittime_avg = mean(cces_waittime)
bysort state: egen state_cces_waittime_sd = sd(cces_waittime)
bysort state: egen state_cces_waittime_N = count(cces_waittime)

* The same three statistics within each race group. Dividing by (group == 1) yields a
* missing value for respondents outside the group, so they drop out of the statistic.
foreach grp in black nonblack {
    bysort state: egen state_cces_waittime_`grp'_avg = mean(cces_waittime / (`grp' == 1))
    bysort state: egen state_cces_waittime_`grp'_sd = sd(cces_waittime / (`grp' == 1))
    bysort state: egen state_cces_waittime_`grp'_N = count(cces_waittime / (`grp' == 1))
}

* Wait time disparity (state level): regress the CCES wait time on a complete set of state
* dummies plus state-by-black interactions (no constant, robust SEs), then copy each
* state's interaction coefficient and standard error onto that state's rows.
* The number of states is read from the encoded variable instead of being hard-coded, so
* the regressor ranges and the loop always match the states present in the sample.
encode state, gen(state_num)
summarize state_num, meanonly
local n_states = r(max)
xi i.state_num*black, noomit
reg cces_waittime _Istate_num_1-_Istate_num_`n_states' _IstaXblac_1-_IstaXblac_`n_states', ///
  noconstant robust
gen state_cces_coef = .
gen state_cces_se = .
forval i = 1/`n_states' {
    replace state_cces_coef = _b[_IstaXblac_`i'] if state_num == `i'
    replace state_cces_se = _se[_IstaXblac_`i'] if state_num == `i'
}

* CD-level CCES wait time: mean, standard deviation and count over all respondents
bysort cd: egen cd_cces_waittime_avg = mean(cces_waittime)
bysort cd: egen cd_cces_waittime_sd = sd(cces_waittime)
bysort cd: egen cd_cces_waittime_N = count(cces_waittime)

* The same three statistics within each race group. Dividing by (group == 1) yields a
* missing value for respondents outside the group, so they drop out of the statistic.
foreach grp in black nonblack {
    bysort cd: egen cd_cces_waittime_`grp'_avg = mean(cces_waittime / (`grp' == 1))
    bysort cd: egen cd_cces_waittime_`grp'_sd = sd(cces_waittime / (`grp' == 1))
    bysort cd: egen cd_cces_waittime_`grp'_N = count(cces_waittime / (`grp' == 1))
}

* Wait time disparity (congressional district level): regress the CCES wait time on a
* complete set of district dummies plus district-by-black interactions (no constant, robust
* SEs), then copy each district's interaction coefficient and standard error onto its rows.
* The number of districts is read from cd_num instead of being hard-coded, so the regressor
* ranges and the loop always match the districts present in the sample.
gen cd_num = cd
summarize cd_num, meanonly
local n_cds = r(max)
xi i.cd_num*black, noomit
reg cces_waittime _Icd_num_1-_Icd_num_`n_cds' _Icd_Xbla_1-_Icd_Xbla_`n_cds', ///
  noconstant robust
gen cd_cces_coef = .
gen cd_cces_se = .
forval i = 1/`n_cds' {
    replace cd_cces_coef = _b[_Icd_Xbla_`i'] if cd_num == `i'
    replace cd_cces_se = _se[_Icd_Xbla_`i'] if cd_num == `i'
}

* Set cd & state coefficient to missing if the standard error is zero (i.e. no variation)
foreach geo in state cd {
    replace `geo'_cces_coef = . if `geo'_cces_se == 0
}

* Reduce to one row per congressional district, and flag one row per state
egen tag_cd = tag(cd)
keep if tag_cd == 1
egen tag_state = tag(state)

* Standard error of each mean wait time
foreach geo in state cd {
    gen `geo'_cces_waittime_se = (`geo'_cces_waittime_sd / sqrt(`geo'_cces_waittime_N))
}

* Empirical Bayes adjusted versions of wait time means & disparities, each estimated on
* one row per unit
foreach geo in state cd {
    ebayes `geo'_cces_waittime_avg `geo'_cces_waittime_se if tag_`geo' == 1, ///
      gen(`geo'_cces_waittime_avg_ebayes)
}
foreach geo in state cd {
    ebayes `geo'_cces_coef `geo'_cces_se if tag_`geo' == 1, gen(`geo'_cces_coef_ebayes)
}

* Copy the state-level adjusted values to every district row of the state (the mode of the
* state's values)
foreach adj in state_cces_waittime_avg_ebayes state_cces_coef_ebayes {
    rename `adj' `adj'1
}
foreach adj in state_cces_waittime_avg_ebayes state_cces_coef_ebayes {
    bysort state: egen `adj' = mode(`adj'1)
}
drop state_cces_waittime_avg_ebayes1 state_cces_coef_ebayes1

* Keep identifiers and CCES measures, then save
keep cd115 state cd cd_cces* state_cces*
sort state cd115
save "${raw_dir}/cces.dta", replace

**********
* Prep individual-level wait time data and then merge to CCES
*********

cd "${master_dir}"
use "${data_dir}/voterwaittimes.dta", clear

* Analysis sample: observations that pass all four filters
keep if likelyvoter_v1 == 1 & enteredpoll == 1 & consistentpinger == 1 & reasonablevalues == 1

* Map a polling place to its congressional district, using Census shapefiles
*  https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.2016.html
* 1) attach polling place coordinates, keeping matched rows only
merge m:1 PollingPlace_ID using "${raw_dir}/PollingPlaces2016_w_TimeZones_and_Buildings.dta", ///
  keep(match) nogenerate keepusing(lat lng)
* 2) convert the district shapefile into Stata attribute and coordinate datasets
shp2dta using "${raw_dir}/cb_2016_us_cd115_500k.shp", replace data("${raw_dir}/cd_data.dta") ///
  coordinates("${raw_dir}/cd_coords.dta")
* 3) locate each point in a district polygon, then attach that polygon's attributes
geoinpoly lat lng using "${raw_dir}/cd_coords.dta"
merge m:1 _ID using "${raw_dir}/cd_data.dta", keep(master match) nogenerate
rename CD115FP cd115

* Recode district numbers to match CCES: DC's "98" and single-district "00" both become "01".
replace cd115 = "01" if (cd115 == "98" & statefips == "11") | cd115 == "00"

* State-level mean, standard deviation and count of the wait time
bysort statefips: egen state_waittime_avg = mean(waittime)
bysort statefips: egen state_waittime_sd = sd(waittime)
bysort statefips: egen state_waittime_N = count(waittime)

* The same statistics for each congressional district (state x district number)
bysort statefips cd115: egen cd_waittime_avg = mean(waittime)
bysort statefips cd115: egen cd_waittime_sd = sd(waittime)
bysort statefips cd115: egen cd_waittime_N = count(waittime)

* Construct state- and congressional-district-level disparity coefficients.
* For each geography, regress wait time on a complete set of geography dummies plus
* geography-by-race_black interactions (no constant, SEs clustered by polling place), then
* copy each unit's interaction coefficient and standard error onto that unit's rows.
* The number of dummies is read from the group variable instead of being hard-coded, so
* the regressor ranges and loops always match the states/districts present in the sample.

* State level
encode statefips, gen(state_num)
summarize state_num, meanonly
local n_states = r(max)
xi i.state_num*race_black, noomit
reg waittime _Istate_num_1-_Istate_num_`n_states' _IstaXrace_1-_IstaXrace_`n_states', ///
  noconstant cl(PollingPlace_ID)
gen state_coef = .
gen state_se = .
forval i = 1/`n_states' {
    replace state_coef = _b[_IstaXrace_`i'] if state_num == `i'
    replace state_se = _se[_IstaXrace_`i'] if state_num == `i'
}

* Congressional district level
egen cd_num = group(statefips cd115)
summarize cd_num, meanonly
local n_cds = r(max)
xi i.cd_num*race_black, noomit
reg waittime _Icd_num_1-_Icd_num_`n_cds' _Icd_Xrac_1-_Icd_Xrac_`n_cds', ///
  noconstant cl(PollingPlace_ID)
gen cd_coef = .
gen cd_se = .
forval i = 1/`n_cds' {
    replace cd_coef = _b[_Icd_Xrac_`i'] if cd_num == `i'
    replace cd_se = _se[_Icd_Xrac_`i'] if cd_num == `i'
}

* Set cd & state coefficient to missing if the standard error is zero (i.e. no variation)
foreach geo in state cd {
    replace `geo'_coef = . if `geo'_se == 0
}

* Reduce to one row per congressional district, and flag one row per state
egen tag_cd = tag(cd_num)
keep if tag_cd == 1
egen tag_state = tag(statefips)

* Standard error of each mean wait time
foreach geo in state cd {
    gen `geo'_waittime_se = (`geo'_waittime_sd / sqrt(`geo'_waittime_N))
}

* Empirical Bayes adjusted versions of wait time means & disparities, each estimated on
* one row per unit
foreach geo in state cd {
    ebayes `geo'_waittime_avg `geo'_waittime_se if tag_`geo' == 1, gen(`geo'_waittime_avg_ebayes)
}
foreach geo in state cd {
    ebayes `geo'_coef `geo'_se if tag_`geo' == 1, gen(`geo'_coef_ebayes)
}

* Copy the state-level adjusted values to every district row of the state (the mode of the
* state's values)
foreach adj in state_waittime_avg_ebayes state_coef_ebayes {
    rename `adj' `adj'1
}
foreach adj in state_waittime_avg_ebayes state_coef_ebayes {
    bysort statefips: egen `adj' = mode(`adj'1)
}
drop state_waittime_avg_ebayes1 state_coef_ebayes1

* Merge by Congressional District to the CCES, keep only districts present in both
* sources, and save (the state key is temporarily renamed to match the CCES file)
rename state_name state
sort state cd115
drop tag_cd
merge 1:1 state cd115 using "${raw_dir}/cces.dta"
tabulate _merge
keep if _merge == 3
rename state state_name
save "${data_dir}/cces_comparison.dta", replace

***************************************************************************************************
*  4. Create datasets for maps produced in ArcGIS
***************************************************************************************************

* Data coverage: for every polling place, whether it has any observation in the wait-time
* file (any_pings) and any observation passing all four filters (all_filter)
use "${data_dir}/voterwaittimes.dta", clear
gen all_filter = (likelyvoter_v1 == 1 & enteredpoll == 1 & consistentpinger == 1 & ///
  reasonablevalues == 1)
gen any_pings = 1
collapse (max) all_filter any_pings, by(PollingPlace_ID)

* Bring in the full polling place list; places absent from the wait-time file get zeros
merge m:1 PollingPlace_ID using "${raw_dir}/PollingPlaces2016_w_TimeZones_and_Buildings.dta"
foreach flag of varlist any_pings all_filter {
    replace `flag' = 0 if `flag' == .
}

* Flag places whose coordinate source is "Uncertain" but whose building_match_convex is 1
gen by_hand = (coord_source == "Uncertain" & building_match_convex == 1)

* County-level shares, exported without disturbing the polling-place-level data
preserve
collapse (mean) any_pings all_filter building_match_convex by_hand, by(fips)
export delimited "${data_dir}/county_poll_stats.csv", replace
restore

* Polling-place-level points for the coverage map
keep lat lng any_pings all_filter
export delimited "${data_dir}/polls_by_pings_identified.csv", replace

* Average wait times by congressional district
use "${data_dir}/voterwaittimes.dta" , clear

* Analysis sample: observations that pass all four filters
keep if likelyvoter_v1 == 1 & enteredpoll == 1 & consistentpinger == 1 & reasonablevalues == 1
keep PollingPlace_ID waittime state_name
rename state_name state

* Attach polling place coordinates, keeping matched rows only
merge m:1 PollingPlace_ID using "${raw_dir}/PollingPlaces2016_w_TimeZones_and_Buildings.dta", ///
  keep(match) nogenerate
keep PollingPlace_ID waittime lat lng state

* Locate each polling place in a congressional district polygon (Census shapefile):
*  https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.2016.html
shp2dta using "${raw_dir}/cb_2016_us_cd115_500k.shp", ///
  replace data("${raw_dir}/cd_data.dta") coordinates("${raw_dir}/cd_coords.dta")
geoinpoly lat lng using "${raw_dir}/cd_coords.dta"
merge m:1 _ID using "${raw_dir}/cd_data.dta", keep(master match)

* District mean wait time and its standard error
bysort GEOID: egen cd_waittime_avg = mean(waittime)
bysort GEOID: egen cd_waittime_sd = sd(waittime)
bysort GEOID: egen cd_waittime_N = count(waittime)
gen cd_waittime_se = (cd_waittime_sd / sqrt(cd_waittime_N))

* One row per district, then the Empirical Bayes adjusted district wait time
egen tag_cd = tag(GEOID)
keep if tag_cd == 1
ebayes cd_waittime_avg cd_waittime_se, gen(cd_waittime_avg_ebayes)

* Export district id and adjusted wait time (named waittime in the output file)
keep cd_waittime_avg_ebayes GEOID
rename cd_waittime_avg_ebayes waittime
export delimited "${data_dir}/wait_times_by_cd.csv", replace

* Close the log opened at the top of this do-file.
log close

* End the do-file here. `exit` stops execution and returns control to the caller cleanly;
* the previous `stop` is not a Stata command and ended every run with an
* "unrecognized command" error.
exit

